// Dense matrix multiplication: C = A * B.
//
// Each work-item computes exactly one element of C. Dimension 0 of the
// NDRange selects the column of C and dimension 1 selects the row.
//
// All three matrices are indexed as flat row-major arrays:
//   A[r][c] -> inputA[r * widthA + c]
//   B[r][c] -> inputB[r * widthB + c]
//   C[r][c] -> outputC[r * widthB + c]
//
// Parameters:
//   inputA  - matrix A, read only, heightA rows by widthA columns
//   inputB  - matrix B, read only, widthB columns per row
//   outputC - result matrix, heightA rows by widthB columns
//   widthA  - number of columns of A (also the length of the dot product)
//   heightA - number of rows of A and of C
//   widthB  - number of columns of B and of C
//   heightB - number of rows of B; not read by this kernel.
//             NOTE(review): the multiplication is only meaningful when
//             widthA == heightB - presumably the host validates this; confirm.
//
// Work-items whose (row, col) falls outside the heightA x widthB result do
// nothing, so the host may round the global work size up to a multiple of
// the work-group size without causing out-of-bounds accesses.
__kernel void matrix_mul(__global const float* inputA,
                             __global const float* inputB,
                             __global float* outputC,
                             int widthA,
                             int heightA,
                             int widthB,
                             int heightB)
{
    // Global position in Y direction -> row of C
    int row = (int)get_global_id(1);

    // Global position in X direction -> column of C
    int col = (int)get_global_id(0);

    // Padding work-items (NDRange larger than the result) must not touch
    // memory: without this guard they would read and write out of bounds.
    if (row >= heightA || col >= widthB) {
        return;
    }

    // Offset of the first element of this row of A; invariant across the loop.
    int rowOffsetA = row * widthA;

    // Dot product of row `row` of A with column `col` of B, accumulated in
    // ascending j order.
    float sum = 0.0f;
    for (int j = 0; j < widthA; j++) {
        sum += inputA[rowOffsetA + j] * inputB[j * widthB + col];
    }

    outputC[row * widthB + col] = sum;
}
